# Importing relevant libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.metrics import (log_loss, roc_auc_score, roc_curve, auc,
                             plot_roc_curve, plot_confusion_matrix,
                             confusion_matrix, classification_report,
                             RocCurveDisplay)
from sklearn.model_selection import GroupKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pd.set_option("display.max_rows", 250)
warnings.filterwarnings('ignore')
# Reading in the dataset
path = "diabetes.csv"
diabetes = pd.read_csv(path)

# Whilst conducting EDA, we discovered a large number of '0' values for some
# important variables (a glucose or blood pressure of 0 is physiologically
# impossible, so it marks a missing measurement). We impute these '0' values
# with the mean of the respective variable, computed separately within each
# outcome class so the imputation respects the class-conditional distribution.
# NOTE(review): as in the original, the mean is computed *including* the
# zeros, so it is biased slightly low — kept for behavioural parity.
postive = diabetes.loc[diabetes["Outcome"] == 1]   # has diabetes
negative = diabetes.loc[diabetes["Outcome"] == 0]  # does not have diabetes

_cols_with_zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness",
                              "BMI", "Insulin"]
for _col in _cols_with_zero_as_missing:
    postive = postive.replace({_col: 0}, postive[_col].mean())
    negative = negative.replace({_col: 0}, negative[_col].mean())

# BUG FIX: the original imputed the two class subsets but never wrote the
# result back, so every downstream model was trained on the raw,
# zero-polluted data. Recombine the imputed halves and restore the original
# row order.
diabetes = pd.concat([postive, negative]).sort_index()
# Creating our train, validation and test sets from the dataset. Note we only
# scale our X variables given our Y variable has the values 0 or 1.
y = diabetes["Outcome"]
diabetes.drop(["Outcome"], axis=1, inplace=True)
X = diabetes

# 80/20 train/test, then 75/25 of the remaining train slice into
# train/validation — i.e. a 60/20/20 split overall.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# BUG FIX: the original kept the *full* 80% slice as X_train, so the
# validation rows leaked into every model fit. Re-assign X_train here so
# training, validation and test sets are disjoint.
# NOTE(review): the misspelled name 'X_valdation' is kept because later code
# refers to it.
X_train, X_valdation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)

# BUG FIX: the original called fit_transform but discarded the result, so no
# scaling was ever applied; it also re-fitted the scaler on the test and
# validation sets (data leakage). Fit on the training set only, transform the
# other sets, and wrap the arrays back into DataFrames so downstream
# .iloc/.loc-based plotting code keeps working.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),
                      columns=X_test.columns, index=X_test.index)
X_valdation = pd.DataFrame(scaler.transform(X_valdation),
                           columns=X_valdation.columns,
                           index=X_valdation.index)
# Tune the SVM hyper-parameters with an exhaustive grid search, once per
# scoring criterion, printing the cross-validated grid scores and the
# classification reports for the training and validation sets.
svm_params = [
    {"kernel": ["linear", "poly", "rbf"],
     "C": [1, 10, 100]},
]
scores = ['precision', 'recall']
for score in scores:
    print(f"# Tuning hyper-parameters for {score}")
    print()

    # 10-fold CV over the parameter grid, optimising the macro-averaged score.
    clf = GridSearchCV(SVC(), svm_params, scoring=f"{score}_macro", cv=10)
    clf.fit(X_train, y_train)

    print("Best parameters set found on training set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on training set:")
    print()
    cv_results = clf.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        # +/- two standard deviations around the mean CV score.
        print(f"{mean:0.3f} (+/-{std * 2:0.03f}) for {params!r}")
    print()
    print("Classification report for training set:")
    print()
    print(classification_report(y_train, clf.predict(X_train)))
    print()
    print("Classification report for validation set:")
    print()
    print(classification_report(y_validation, clf.predict(X_valdation)))
    print()
# Implementing the SVM with the optimal parameters found from the grid search
svm = SVC(C=10, kernel="poly")
svm.fit(X_train, y_train)


def _plot_svm_roc(model, X_data, y_data, title):
    """Plot the ROC curve of `model` on (X_data, y_data) with a chance line."""
    try:
        # matplotlib >= 3.6 renamed the style to "seaborn-v0_8".
        plt.style.use("seaborn-v0_8")
    except OSError:
        plt.style.use("seaborn")
    fig, ax_roc = plt.subplots(1, figsize=(7, 7))
    # BUG FIX: sklearn removed plot_roc_curve in 1.2; RocCurveDisplay is the
    # supported replacement and produces the same curve.
    RocCurveDisplay.from_estimator(model, X_data, y_data, ax=ax_roc,
                                   color="darkorange")
    # Diagonal reference line for a no-skill classifier.
    ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax_roc.set_title(title)
    ax_roc.grid(linestyle='--')
    ax_roc.legend()
    plt.show()


# Plotting the ROC curve for our training set
_plot_svm_roc(svm, X_train, y_train, 'ROC curve for SVM (Training)')
# Plotting the ROC curve for our validation set
_plot_svm_roc(svm, X_valdation, y_validation, 'ROC curve for SVM (Validation)')
# Decision-surface plot: Insulin (col 4) vs BMI (col 5) on the validation set.
X_set, y_set = X_valdation, y_validation
from matplotlib.colors import ListedColormap

# BUG FIX: the original built the grid from columns 0 and 1 and slotted the
# grid values into columns 0 and 1 of the prediction matrix, while the
# scatter plotted columns 4 and 5 — so the contour background did not
# correspond to the plotted features. Build the grid from the two plotted
# columns and place the grid values at those same column positions.
_c1, _c2 = 4, 5  # Insulin, BMI
# Fixed-resolution grid: a hard-coded 0.01 step explodes to millions of
# points on unscaled feature ranges.
X1, X2 = np.meshgrid(
    np.linspace(X_set.iloc[:, _c1].min() - 1, X_set.iloc[:, _c1].max() + 1, 300),
    np.linspace(X_set.iloc[:, _c2].min() - 1, X_set.iloc[:, _c2].max() + 1, 300))
# All remaining features are held at 0 (their mean once standardised —
# assumes the data was scaled upstream; TODO confirm).
Xpred = np.zeros((X1.size, X_set.shape[1]))
Xpred[:, _c1] = X1.ravel()
Xpred[:, _c2] = X2.ravel()
pred = svm.predict(Xpred).reshape(X1.shape)  # matrix of 0's and 1's
plt.contourf(X1, X2, pred, alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set.loc[y_set == j, X_set.columns[_c1]],
                X_set.loc[y_set == j, X_set.columns[_c2]],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Validation Set')
plt.xlabel('Insulin')
plt.ylabel('BMI')
plt.legend()
plt.show()
# Decision-surface plot: Glucose (col 1) vs BloodPressure (col 2) on the
# validation set.
X_set, y_set = X_valdation, y_validation
from matplotlib.colors import ListedColormap

# BUG FIX: the original built the grid from columns 0 and 1 and slotted the
# grid values into columns 0 and 1 of the prediction matrix, while the
# scatter plotted columns 1 and 2 — so the contour background did not
# correspond to the plotted features. Build the grid from the two plotted
# columns and place the grid values at those same column positions.
_c1, _c2 = 1, 2  # Glucose, BloodPressure
# Fixed-resolution grid: a hard-coded 0.01 step explodes to millions of
# points on unscaled feature ranges.
X1, X2 = np.meshgrid(
    np.linspace(X_set.iloc[:, _c1].min() - 1, X_set.iloc[:, _c1].max() + 1, 300),
    np.linspace(X_set.iloc[:, _c2].min() - 1, X_set.iloc[:, _c2].max() + 1, 300))
# All remaining features are held at 0 (their mean once standardised —
# assumes the data was scaled upstream; TODO confirm).
Xpred = np.zeros((X1.size, X_set.shape[1]))
Xpred[:, _c1] = X1.ravel()
Xpred[:, _c2] = X2.ravel()
pred = svm.predict(Xpred).reshape(X1.shape)  # matrix of 0's and 1's
plt.contourf(X1, X2, pred, alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set.loc[y_set == j, X_set.columns[_c1]],
                X_set.loc[y_set == j, X_set.columns[_c2]],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Validation Set')
plt.xlabel('Glucose')
plt.ylabel('Blood Pressure')
plt.legend()
plt.show()
# Decision-surface plot: Pregnancies (col 0) vs Glucose (col 1) on the
# validation set. Unlike the sibling plots, the original's grid columns
# (0 and 1) already matched the scattered columns here; only the grid
# resolution and the misleading comment needed fixing.
X_set, y_set = X_valdation, y_validation
from matplotlib.colors import ListedColormap

_c1, _c2 = 0, 1  # Pregnancies, Glucose
# Fixed-resolution grid: a hard-coded 0.01 step explodes to millions of
# points on unscaled feature ranges.
X1, X2 = np.meshgrid(
    np.linspace(X_set.iloc[:, _c1].min() - 1, X_set.iloc[:, _c1].max() + 1, 300),
    np.linspace(X_set.iloc[:, _c2].min() - 1, X_set.iloc[:, _c2].max() + 1, 300))
# All remaining features are held at 0 (their mean once standardised —
# assumes the data was scaled upstream; TODO confirm).
Xpred = np.zeros((X1.size, X_set.shape[1]))
Xpred[:, _c1] = X1.ravel()
Xpred[:, _c2] = X2.ravel()
pred = svm.predict(Xpred).reshape(X1.shape)  # matrix of 0's and 1's
plt.contourf(X1, X2, pred, alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set.loc[y_set == j, X_set.columns[_c1]],
                X_set.loc[y_set == j, X_set.columns[_c2]],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Validation Set')
plt.xlabel('Pregnancies')
plt.ylabel('Glucose')
plt.legend()
plt.show()
# Decision-surface plot: BMI (col 5) vs Age (col 7) on the validation set.
X_set, y_set = X_valdation, y_validation
from matplotlib.colors import ListedColormap

# BUG FIX: the original built the grid from columns 0 and 1 and slotted the
# grid values into columns 0 and 1 of the prediction matrix, while the
# scatter plotted columns 5 and 7 — so the contour background did not
# correspond to the plotted features. Build the grid from the two plotted
# columns and place the grid values at those same column positions.
_c1, _c2 = 5, 7  # BMI, Age
# Fixed-resolution grid: a hard-coded 0.01 step explodes to millions of
# points on unscaled feature ranges.
X1, X2 = np.meshgrid(
    np.linspace(X_set.iloc[:, _c1].min() - 1, X_set.iloc[:, _c1].max() + 1, 300),
    np.linspace(X_set.iloc[:, _c2].min() - 1, X_set.iloc[:, _c2].max() + 1, 300))
# All remaining features are held at 0 (their mean once standardised —
# assumes the data was scaled upstream; TODO confirm).
Xpred = np.zeros((X1.size, X_set.shape[1]))
Xpred[:, _c1] = X1.ravel()
Xpred[:, _c2] = X2.ravel()
pred = svm.predict(Xpred).reshape(X1.shape)  # matrix of 0's and 1's
plt.contourf(X1, X2, pred, alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set.loc[y_set == j, X_set.columns[_c1]],
                X_set.loc[y_set == j, X_set.columns[_c2]],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Validation Set')
plt.xlabel('BMI')
plt.ylabel('Age')
plt.legend()
plt.show()
import pickle
# Persist the fitted GridSearchCV object (the last `clf` from the tuning
# loop, refit on its best parameters) so it can be reloaded without
# retraining. NOTE(review): pickle files must only be loaded from trusted
# sources — pickle.load on untrusted data executes arbitrary code.
with open("optimal_svm", "wb") as op_svm:
    pickle.dump(clf, op_svm)
# The SVM that achieved the best results had the parameters SVC(C=10, kernel='poly').